## - ## 

# Replication code for 

# Bailo, F. (2015). 
# Mapping online political talks through network analysis: 
# A case study of the website of Italy’s Five Star Movement. 
# Policy Studies, 36(6), 550–572. 

# Paper available at http://www.tandfonline.com/doi/full/10.1080/01442872.2015.1095282

# Contact: francesco.bailo@sydney.edu.au

## - ## - ## 

# Libraries
require(igraph)
require(plyr)
require(sqldf)
require(ggplot2)
require(stringr)
require(Hmisc)
require(grid)
require(gridExtra)
require(xtable)
require(stargazer)
require(sandwich)
require(vcd)
require(AER)

# Load data
# Note: 
# Data is anonymised (usernames were replaced by numbers),
# live URLs to the discussion threads are included. 
load("m5sforum_feb2014_thread.RData")
load("m5sforum_feb2014_comment.RData")
load("m5sforum_feb2014_author.RData")

# Cut date
cut_date <- as.Date("2014-01-31")

# Descriptive statistics
m5s.cln.num_users <- as.character(nrow(author_omega))
m5s.cln.num_threads <- as.character(nrow(thread))
m5s.cln.num_comments <- as.character(nrow(comment))
m5s.cln.first_post <- format(min(as.Date(thread[['createdAt']]),na.rm = TRUE), "%e %B %Y")
m5s.cln.last_post <- format(max(as.Date(thread[['createdAt']]),na.rm = TRUE), "%e %B %Y")
m5s.cln.first_timestamp <- format(min(as.Date(thread[['timestamp']]),na.rm = TRUE), "%e %B %Y")
m5s.cln.last_timestamp <- format(max(as.Date(thread[['timestamp']]),na.rm = TRUE), "%e %B %Y")

m5s.cln.num_male_users <-nrow(subset(author_omega, gender=="male"))
m5s.cln.num_female_users <- nrow(subset(author_omega, gender=="female"))

## Bigraph creation ## 

# Create nodes dataframe for authors
nodes_author <- subset(author_omega, select = c("id","gender","member","timestamp"))
nodes_author$name <- NA
nodes_author$type <- FALSE
nodes_author$reactions <- NA
nodes_author$dislikes <- NA
nodes_author$createdAt <- NA
nodes_author$slug <- NA
nodes_author$postNumber <- NA
nodes_author$likes <- NA
nodes_author$message <- NA
nodes_author$score <- NA
nodes_author$categoryLink <- NA
nodes_author$url <- author_omega$id

# Create nodes dataframe for threads
nodes_thread <- subset(thread, 
                       select = c("link","title","reactions","dislikes","createdAt",
                                  "slug","postNumber","likes","message","score","categoryLink","timestamp"))
nodes_thread <- plyr::rename(nodes_thread, c("link"="id","title"="name"))
nodes_thread$type <- TRUE
nodes_thread$gender <- NA
nodes_thread$member <- NA
nodes_thread$url <- thread$link

# Merge two node sets
nodes <- rbind(nodes_author,nodes_thread)
# rm(nodes_author);rm(nodes_thread)

# Create edges dataframe for proposals
edges_proposal <- thread[,c("authorUrl","link","createdAt","title")]
edges_proposal <- plyr::rename(edges_proposal, c("authorUrl"="from", "link"="to"))
edges_proposal$type <- TRUE
edges_proposal$parent <- NA
edges_proposal$dislikes <- NA
edges_proposal$message <- NA
edges_proposal$numReports <- NA
edges_proposal$likes <- NA
edges_proposal$url <- thread$link
edges_proposal$textId <- thread$threadId

# Create edges dataframe for comments
edges_comment <- sqldf('SELECT 
                       comment.authorUrl AS source, 
                       thread.link AS target, 
                       comment.createdAt AS createdAt, 
                       comment.parent AS parent, 
                       comment.dislikes AS dislikes,
                       comment.message AS message,
                       comment.numReports AS numReports,
                       comment.likes AS likes,
                       comment.commentId AS textId,
                       thread.link AS url
                       FROM comment JOIN thread ON comment.threadId=thread.threadId')
edges_comment <- plyr::rename(edges_comment, c("source"="from","target"="to"))
edges_comment$type <- FALSE
edges_comment$title <- NA

# Merge two edge sets
edges <- rbind(edges_proposal, edges_comment)
# rm(edges_proposal); rm(edges_comment)

# Remove 2 non-matching edges
edges_no_match <- sum(is.na(match(edges$from,nodes$id))==TRUE)
edges <- edges[-which(is.na(match(edges$from,nodes$id))),]

#Create igraph object
igraph_bi <- graph.data.frame(edges, directed=TRUE, vertices=nodes)

# Calculate degree distributions of nodes (only for thread or "TRUE")
bi_degree <- degree(igraph_bi, v=V(igraph_bi)$type==TRUE, mode="in")

# Transform to table for ggplot
bi_degree.df <- data.frame(table(bi_degree=factor(bi_degree, levels=seq_len(max(bi_degree)))))
bi_degree.df$bi_degree <- as.numeric(as.character(bi_degree.df$bi_degree))

# Calculate degree distribution of random network (Erdős–Rényi)
igraph_bi.random <- bipartite.random.game(nrow(nodes_thread)%/%2, nrow(nodes_author)%/%2, type="gnm", m=nrow(edges)%/%2, directed=TRUE, mode="in")
bi_degree.random <- degree(igraph_bi.random, V(igraph_bi.random), mode="in")
bi_degree.random <- data.frame(table(bi_degree.random=factor(bi_degree.random, levels=seq_len(max(bi_degree.random)))))
bi_degree.random$bi_degree.random <- as.numeric(as.character(bi_degree.random$bi_degree.random))

# Calculate degree distribution of random network (Barabási–Albert)
igraph_bara.random <- ba.game(nrow(nodes))
degree_bara.random <- degree(igraph_bara.random, V(igraph_bara.random), mode="in")
degree_bara.random <- data.frame(table(degree_bara.random=factor(degree_bara.random, levels=seq_len(max(degree_bara.random)))))
degree_bara.random$degree_bara.random <- as.numeric(as.character(degree_bara.random$degree_bara.random))

# Calculate network clusters
bi_clusters <- clusters(igraph_bi, mode="weak")
bi_clusters.size <- data.frame(table(bi_clusters$csize))
bi_clusters.no <- no.clusters(igraph_bi, mode=c("weak"))
bi_max.cluster.size <- max(as.numeric(levels(bi_clusters.size$Var1)))

# Calculate degree distributions of nodes (only for user vertices or "FALSE")
bi_user_degree <- degree(igraph_bi, v=V(igraph_bi)$type==FALSE, mode="out")

# Transform to table for ggplot
bi_user_degree.df <- data.frame(table(bi_user_degree=factor(bi_user_degree, levels=seq_len(max(bi_user_degree)))))
bi_user_degree.df$bi_user_degree <- as.numeric(as.character(bi_user_degree.df$bi_user_degree))

# Simplify bigraph
V(igraph_bi)$size <- degree(igraph_bi, V(igraph_bi), mode="in")

# Simplify graph to remove double edges
igraph_bi_simp <- simplify(igraph_bi)


## Projection of bipartite graph (computationally intense and it creates a 3.5Gb object) ##
igraph_bi_projected <- bipartite.projection(igraph_bi_simp)
igraph_onemode1 <- igraph_bi_projected$proj1
igraph_onemode2 <- igraph_bi_projected$proj2


## One-mode projection: threads ##
## Either compute or load ##

# Cleaning
# category_attr <- get.vertex.attribute(igraph_onemode2, "categoryLink")
# replace <- category_attr==""
# V(igraph_onemode2)[replace]$categoryLink <- "No label"
# rm(replace)
# 
# V(igraph_onemode2)$category_size <- 1
# 
# g <- contract.vertices(igraph_onemode2, factor(V(igraph_onemode2)$categoryLink),
#                        vertex.attr.comb=list(categoryLink="first",
#                                              category_size="sum",
#                                              name="ignore",
#                                              gender="ignore",
#                                              member="ignore",
#                                              timestamp="ignore",
#                                              type="ignore",
#                                              reactions="ignore",
#                                              dislikes="ignore",
#                                              postNumber="ignore",
#                                              likes="ignore",
#                                              message="ignore",
#                                              score="ignore",
#                                              createdAt="ignore",
#                                              slug="ignore",
#                                              weight="ignore" ))
# 
# igraph_thread_categories <- simplify(g, remove.loops=FALSE)

# Load
load("m5sforum_feb2014_igraph_thread_categories.RData")

# Replace category labels
V(igraph_thread_categories)$categoryLink <- gsub("orum", "Forum", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("trasporti/viabilita", "transport\nroad conditions", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("acqua/acqua-pubblica", "water\npublic-water", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("acqua/risparmio", "water\nsaving", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("ambiente/inceneritori", "environment\nincinerators", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("ambiente/raccolta-differenziata", "environment\nrecycling", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("ambiente/rigassificatori", "environment\nLNG terminal", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("energia/energie-alternative", "energy\nalternative energy", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("energia/produzione", "energy\nproduction", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("energia/risparmio-energetico", "energy\nenergy saving", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("servizi-ai-cittadini/connettivita", "services\nconnectivity", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("servizi-ai-cittadini/per-gli-anziani", "services\nelders", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("servizi-ai-cittadini/per-i-bambini", "services\nchildren", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("servizi-ai-cittadini/per-i-giovani", "services\nyouth", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("servizi-ai-cittadini", "services to citizens", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("sviluppo/appalti-pubblici", "development\npublic contracting", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("sviluppo/commercio", "development\ntrade", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("sviluppo/edilizia-sostenibile", "development\nsustainable construction", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("sviluppo/lavoro", "development\njobs", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("sviluppo/territorio", "development\nterritory", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("sviluppo/urbanistica", "development\ncity planning", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("trasporti/mezzi-pubblici", "transport\npublic transport", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("sviluppo", "development", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("energia", "energy", V(igraph_thread_categories)$categoryLink)
V(igraph_thread_categories)$categoryLink <- gsub("ambiente", "environment", V(igraph_thread_categories)$categoryLink)


## Group categories ##

# Create new category to store parent label
V(igraph_thread_categories)$parentCategory <- V(igraph_thread_categories)$categoryLink

# Parse first word of string and store it in $parentCategory
regex <- perl("(?<=^)\\w\\w+")
V(igraph_thread_categories)$parentCategory <- capitalize(str_extract(V(igraph_thread_categories)$categoryLink,regex))
V(igraph_thread_categories)$parentCategory <- gsub("No", "No label", V(igraph_thread_categories)$parentCategory)
rm(regex)

# Contract vertices on $parentCategory
g <- contract.vertices(igraph_thread_categories,
                       factor(V(igraph_thread_categories)$parentCategory),
                       vertex.attr.comb=list(parentCategory="first",
                                             category_size="sum",
                                             weight="sum",
                                             categoryLink="ignore"))

igraph_thread_parentCategories <- simplify(g, remove.loops=FALSE)


## User communities ##
## Either compute or load ##

# Create user communites graph (only for communities with more than 20 members)
# V(igraph_onemode1)$community_size <- 1
# cg <- contract.vertices(igraph_onemode1, membership(igraph_users.fastgreedy), vertex.attr.comb=list("community_size"="sum","first"))
# cgsub <- induced.subgraph(cg,V(cg)$community_size>20)
# rm(cg)
# E(cgsub)$weight <- 1
# igraph_user_communities <- simplify(cgsub, remove.loops=FALSE)
# rm(cgsub)
load("m5sforum_feb2014_igraph_user_communities.RData")

## - ##


## Sample user graph and run poisson regression

# Following part is commented because RData files are also provided
# Unfortunately I did not set a seed so if run again will produce
# slightly different results
# sample <- sample(vcount(igraph_onemode2), 10000)
# graph_train <- induced.subgraph(igraph_onemode2, sample)
# train_predictors <- edgelist_df(graph_train)
# train_predictors <- data.frame(train_predictors)
# Regression Analysis
# lm.fit <- glm(e_weight~mult_size+time_diff, data=train_predictors, family=poisson)
load("train_predictors_lm.fit.RData")
load("train_predictors.RData")


## FIGURE 1 ##

thread.as.date = as.Date(thread[['createdAt']])
comment.as.date = as.Date(comment[['createdAt']])
week_vector <- seq(from = as.Date("2009-02-11"), to = as.Date(cut_date - 7), by = 7)

ts_threads.week <- as.data.frame(table(cut(thread.as.date, breaks = week_vector)))
ts_comments.week <- as.data.frame(table(cut(comment.as.date, breaks = week_vector)))

# First national assembly
line1 = as.numeric(as.Date("2009-03-08"))
# First regional election
line2 = as.numeric(as.Date("2010-03-29"))
# Local elections
line3 = as.numeric(as.Date("2011-05-16"))
# Local elections
line4 = as.numeric(as.Date("2012-05-07"))
# Sicilian elections
line5 = as.numeric(as.Date("2012-10-28"))
# National elections
line6 = as.numeric(as.Date("2013-02-25"))

theme_set(theme_bw() + theme(text=element_text(family="Palatino", size=10)))

ggplot(ts_threads.week, aes(x=as.Date(Var1), y=Freq,)) + geom_line(size=.2) + 
  labs(x=NULL) +
  ylab("Proposals") +
  theme(
    plot.margin = unit(c(-1,0.5,0.5,0.5), "lines"),
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(colour = "gray")) +
  geom_vline(xintercept=line1, linetype="dotted", size=.2) +
  geom_vline(xintercept=line2, linetype="dotted", size=.2) +
  geom_vline(xintercept=line3, linetype="dotted", size=.2) +
  geom_vline(xintercept=line4, linetype="dotted", size=.2) +
  geom_vline(xintercept=line5, linetype="dotted", size=.2) +
  geom_vline(xintercept=line6, linetype="dotted", size=.2) +
  geom_area()

ggplot(ts_comments.week, aes(x=as.Date(Var1), y=Freq)) + geom_line(size=.2) +
  labs(x=NULL) +
  ylab("Comments") +
  ggplot2::annotate("text", x=as.Date(line1, origin = "1970-01-01"), y=6000, 
           label="National\nassembly", 
           family= theme_get()$text[["family"]], 
           size=theme_get()$text[["size"]]/2.5, 
           fontface="italic") +
  ggplot2::annotate("text", x=as.Date(line2, origin = "1970-01-01"), y=6000, 
           label="Regional\nelections",
           family= theme_get()$text[["family"]], 
           size=theme_get()$text[["size"]]/2.5, 
           fontface="italic") +
  ggplot2::annotate("text", x=as.Date(line3, origin = "1970-01-01"), y=6000, 
           label="Local\nelections", 
           family= theme_get()$text[["family"]], 
           size=theme_get()$text[["size"]]/2.5, 
           fontface="italic") +
  ggplot2::annotate("text", x=as.Date(line4, origin = "1970-01-01"), y=6000, 
           label="Local\nelections",
           family= theme_get()$text[["family"]], 
           size=theme_get()$text[["size"]]/2.5, 
           fontface="italic") +
  ggplot2::annotate("text", x=as.Date(line5, origin = "1970-01-01"), y=9300, 
           label="Sicilian\nelections",
           family= theme_get()$text[["family"]], 
           size=theme_get()$text[["size"]]/2.5, 
           fontface="italic") +
  ggplot2::annotate("text", x=as.Date(line6, origin = "1970-01-01"), y=24000, 
           label="General elections",
           family= theme_get()$text[["family"]], 
           size=theme_get()$text[["size"]]/2.5, 
           fontface="italic") +
  geom_vline(xintercept=line1, linetype="dotted", size=.2) +
  geom_vline(xintercept=line2, linetype="dotted", size=.2) +
  geom_vline(xintercept=line3, linetype="dotted", size=.2) +
  geom_vline(xintercept=line4, linetype="dotted", size=.2) +
  geom_vline(xintercept=line5, linetype="dotted", size=.2) +
  geom_vline(xintercept=line6, linetype="dotted", size=.2) +
  theme(axis.text.x=element_blank(),
        axis.title.x=element_blank(),
        plot.title=element_blank(),
        axis.ticks.x=element_blank(),
        plot.margin = unit(c(0.5,0.5,0.5,0.5), "lines"),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "gray")) +
  geom_area()


## FIGURE 2 ##

author_omega$gender[which(author_omega$gender=="undefined")] <- "unknown"

# User's gender
ggplot(subset(author_omega, gender=="male" | gender=="female" | gender=="unknown"), aes(x=gender, fill = gender)) +
  geom_bar() +
  labs(x=NULL, y=NULL) +
  scale_fill_manual("gender", 
                    values=c("red", "blue", "grey")) +
  guides(fill=FALSE)
#theme(text = element_text(size=24))

# Gender by thread
ggplot(subset(sqldf("SELECT author_omega.gender AS gender FROM author_omega JOIN thread ON author_omega.id=thread.authorUrl"), gender=="male" | gender=="female" | gender=="unknown"), aes(x=gender, fill = gender)) +
  geom_bar() +
  labs(x=NULL, y=NULL) +
  scale_fill_manual("gender", 
                    values=c("red", "blue","grey")) +
  guides(fill=FALSE)
#theme(text = element_text(size=12))

# Gender by comment
ggplot(subset(sqldf("SELECT author_omega.gender AS gender FROM comment JOIN author_omega ON author_omega.id=comment.authorUrl"), gender=="male" | gender=="female" | gender=="unknown"), aes(x=gender, fill = gender)) +
  geom_bar() +
  labs(x=NULL, y=NULL) +
  scale_fill_manual("gender", 
                    values=c("red", "blue","grey")) +
  guides(fill=FALSE) 


## UNPUBLISHED FIGURE (Thread categories) ##

df <- data.frame(table(thread$categoryLink))
df$Var1 <- as.factor(df$Var1)
df$Var1 <- gsub("orum", "Forum", df$Var1)
df$Var1 <- gsub("^*$", "None", df$Var1)

df$Var1 <- gsub("trasporti/viabilita", "transport / road conditions", df$Var1)
df$Var1 <- gsub("acqua/acqua-pubblica", "water / public-water", df$Var1)
df$Var1 <- gsub("acqua/risparmio", "water / saving", df$Var1)
df$Var1 <- gsub("ambiente/inceneritori", "enviroment / incinerators", df$Var1)
df$Var1 <- gsub("ambiente/raccolta-differenziata", "enviroment / recycling", df$Var1)
df$Var1 <- gsub("ambiente/rigassificatori", "enviroment / LNG terminal", df$Var1)
df$Var1 <- gsub("energia/energie-alternative", "energy / alternative energy", df$Var1)
df$Var1 <- gsub("energia/produzione", "energy / production", df$Var1)
df$Var1 <- gsub("energia/risparmio-energetico", "energy / energy saving", df$Var1)
df$Var1 <- gsub("servizi-ai-cittadini/connettivita", "services / connectivity", df$Var1)
df$Var1 <- gsub("servizi-ai-cittadini/per-gli-anziani", "services / elders", df$Var1)
df$Var1 <- gsub("servizi-ai-cittadini/per-i-bambini", "services / children", df$Var1)
df$Var1 <- gsub("servizi-ai-cittadini/per-i-giovani", "services / youth", df$Var1)
df$Var1 <- gsub("servizi-ai-cittadini", "services to citizens", df$Var1)
df$Var1 <- gsub("sviluppo/appalti-pubblici", "development / public contracting", df$Var1)
df$Var1 <- gsub("sviluppo/commercio", "development / trade", df$Var1)
df$Var1 <- gsub("sviluppo/edilizia-sostenibile", "development / sustainable construction", df$Var1)
df$Var1 <- gsub("sviluppo/lavoro", "development / jobs", df$Var1)
df$Var1 <- gsub("sviluppo/territorio", "development / territory", df$Var1)
df$Var1 <- gsub("sviluppo/urbanistica", "development / city planning", df$Var1)
df$Var1 <- gsub("trasporti/mezzi-pubblici", "transport / public transport", df$Var1)
df$Var1 <- gsub("sviluppo", "development", df$Var1)
df$Var1 <- gsub("energia", "energy", df$Var1)
df$Var1 <- gsub("ambiente", "environment", df$Var1)
df$Var1 <- gsub("acqua", "water", df$Var1)

ggplot(df,(aes(y=Freq, x=Var1))) + geom_histogram(stat="identity", binwidth=0.1) + coord_flip() + labs(y=NULL, x=NULL) +
  scale_x_discrete(limits=c("None","Forum","water / saving","water / public-water","transport / road conditions","transport / public transport","services to citizens","services / youth","services / elders","services / children","enviroment / recycling","enviroment / incinerators","enviroment / LNG terminal","environment","energy / production","energy / energy saving","energy / alternative energy","energy","development / trade","development / territory","development / sustainable construction","development / public contracting","development / jobs","development / city planning","development"))


## UNPUBLISHED FIGURE (Digree distribution of forum and two random graphs) ##

# Plots
ggplot(bi_degree.df, aes(x=bi_degree, y=Freq)) +
  xlab("Degree (log)") +
  ylab("Frequency") +
  geom_line() + scale_x_log10()

ggplot(bi_degree.random, aes(x=bi_degree.random, y=Freq)) +
  xlab("Degree (log)") +
  labs(y=NULL) +
  geom_line() + scale_x_log10()

ggplot(degree_bara.random, aes(x=degree_bara.random, y=Freq)) +
  xlab("Degree (log)") +
  labs(y=NULL) +
  geom_line() + scale_x_log10()


## UNPUBLISHED FIGURE (Ratio common degree) ##

dg <- c("1","2","3","4","5","6","7","8+")
fq <- c(sum(bi_degree==1)/length(bi_degree)*100,
        sum(bi_degree==2)/length(bi_degree)*100,
        sum(bi_degree==3)/length(bi_degree)*100,
        sum(bi_degree==4)/length(bi_degree)*100,
        sum(bi_degree==5)/length(bi_degree)*100,
        sum(bi_degree==6)/length(bi_degree)*100,
        sum(bi_degree==7)/length(bi_degree)*100,
        sum(bi_degree>=8)/length(bi_degree)*100)

df <- data.frame(dg,fq)

ggplot(df, aes(y=fq, x=dg)) +
  geom_bar(stat="identity") +
  guides(fill=FALSE) +
  labs(y="%", x="Degree") +
  coord_flip()


rm(df); rm (dg); rm(fq)


## UNPUBLISHED FIGURE (CLUSTER SIZES) ##

ggplot(bi_clusters.size, aes(x=as.factor(Var1), y=Freq)) +
  xlab("Size") +
  ylab("Frequency") +
  geom_histogram(stat="identity") +
  annotate("segment", x=13, xend=13, y=2000, yend=500,
           size=0.5, arrow=arrow(length = unit(0.2, "cm"))) + coord_flip()


## FIGURE 8 ##

# One node is mislabelled
igraph_thread_parentCategories <- 
  delete.vertices(igraph_thread_parentCategories, V(igraph_thread_parentCategories)$parentCategory=="Acqua")

par(mar=c(0,0,0,0))
layout <- layout.circle(igraph_thread_parentCategories)
plot(igraph_thread_parentCategories,
     layout=layout,
     vertex.label=NA,
     vertex.frame.color=NA,
     edge.width=(E(igraph_thread_parentCategories)$weight)/300000,
     vertex.size=log(V(igraph_thread_parentCategories)$category_size)*2)
plot(igraph_thread_parentCategories,
     add=TRUE,
     layout=layout,
     vertex.label.color="black",
     edge.label.color="black",
     vertex.label.family="Palatino",
     edge.label.family="Palatino",
     edge.label.cex=0.6,
     edge.loop.angle=0.1,
     # edge.label=sub("[ ]+", "", format(E(igraph_thread_parentCategories)$weight, big.mark=",", scientific=FALSE)),
     edge.width=0.001,
     vertex.label.cex=log(V(igraph_thread_parentCategories)$category_size)/8,
     vertex.shape="none",
     vertex.label=V(igraph_thread_parentCategories)$parentCategory,
     edge.width=(E(igraph_thread_parentCategories)$weight)/300000,
     vertex.size=log(V(igraph_thread_parentCategories)$category_size)*2)


## FIGURE 10 ##

par(mar=c(0,0,0,0))
layout=layout.circle(igraph_user_communities)
plot(igraph_user_communities,
     layout=layout,
     vertex.frame.color=NA,
     vertex.size=log(V(igraph_user_communities)$community_size)+7,
     edge.width=log(E(igraph_user_communities)$weight),
     edge.label=NA,
     vertex.label=NA)
plot(igraph_user_communities,
     vertex.label.color="black",
     edge.label.color="black",
     add=TRUE,
     layout=layout,
     vertex.label.family="Palatino",
     edge.label.family="Palatino",
     edge.label.cex=0.7,
     edge.loop.angle=0.1,
     edge.width=0.01,
     vertex.shape="none",
     vertex.label=V(igraph_user_communities)$community_size) 


## Immigration threads ##

immigration_threads_coded <- read.csv("immigration_threads_coded.csv")

# Selected urls
immigration_ids <- immigration_threads_coded$id
immigration_urls <- thread$link[match(immigration_ids,thread$threadId)]

vertices <- integer()

for (i in immigration_urls) {
  vertices <- c(vertices, which(V(igraph_bi)$url==i))
}

for (i in immigration_urls) {
  vertices <- c(vertices, unlist(
    neighborhood(igraph_bi, 1, which(V(igraph_bi)$url==i), mode=c("in"))))
}

bg <- induced.subgraph(igraph_bi, vertices)
V(bg)$size <- degree(bg, V(bg), mode="in")

bg_simp <- simplify(bg)
V(bg_simp)$size <- degree(bg_simp, V(bg_simp), mode="in")

pg <- bipartite.projection(bg)

# UNPUBLISHED FIGURE 
plot(pg$proj2,
     vertex.label=V(pg$proj2)$size,
     layout=layout.circle(pg$proj2),
     vertex.size=log(V(pg$proj2)$size),
     edge.label=E(pg$proj2)$weight)

pg_simp <- bipartite.projection(bg_simp)


# Correct encoding error
immigration_threads_coded$title <- gsub("Ã ", "à",  immigration_threads_coded$title)

# Loop and assign colorcode
for (i in 1:vcount(pg_simp$proj2)) {
  
  V(pg_simp$proj2)$color[i] <- as.character(immigration_threads_coded$colorcode[match(
    V(pg_simp$proj2)$name[i],immigration_threads_coded$title)
    ])
  
}

# UNPUBLISHED FIGURE
plot(pg_simp$proj2,
     vertex.label=V(pg_simp$proj2)$size,
     layout=layout.fruchterman.reingold(pg$proj2,niter=500,area=vcount(pg$proj2)^2.3,repulserad=vcount(pg$proj2)^2.8),
     vertex.size=V(pg_simp$proj2)$size*0.3,
     edge.label=E(pg_simp$proj2)$weight,
     edge.width=E(pg_simp$proj2)$weight)

# Edge list & attributes
v_pairs <- get.edges(pg_simp$proj2, E(pg_simp$proj2))
left_size <- as.integer(V(pg_simp$proj2)$size[v_pairs[,1]])
right_size <- as.integer(V(pg_simp$proj2)$size[v_pairs[,2]])
left_time <- as.integer(as.POSIXct(V(pg_simp$proj2)$createdAt[v_pairs[,1]],
                                   format="%Y-%m-%dT%H:%M:%S", 
                                   tz="UTC"))
right_time <- as.integer(as.POSIXct(V(pg_simp$proj2)$createdAt[v_pairs[,2]],
                                    format="%Y-%m-%dT%H:%M:%S", 
                                    tz="UTC"))
e_weight <- as.integer(E(pg_simp$proj2)$weight)
time_diff <- abs(left_time - right_time)
rm(right_time,left_time)

# Create predictor df
edge_predictors <- e_weight
# rm(e_weight)
edge_predictors <- cbind(edge_predictors, left_size)
rm(left_size)
colnames(edge_predictors)[1] <- "e_weight"
edge_predictors <- cbind(edge_predictors, right_size)
rm(right_size)
mult_size <- edge_predictors[,2]*edge_predictors[,3]
edge_predictors <- cbind(edge_predictors, mult_size)
rm(mult_size)
edge_predictors <- cbind(edge_predictors, time_diff)
rm(time_diff)
edge_predictors <- data.frame(edge_predictors)

# Predict "expected" values
pois.pr <- predict(lm.fit, newdata=edge_predictors, se.fit=TRUE)
pois.fit <- cbind(pois.pr$fit, pois.pr$se.fit)

# Create function to generate confidence intervals 
# for poisson regression. It expects two column matrix
pois.confint <- function(pois.fit){
  fit <- pois.fit[,1]
  se.fit <- pois.fit[,2]
  ll <- numeric()
  ul <- numeric()
  for (i in 1:length(fit)) {
    ll <- c(ll, fit[i]-1.96*se.fit[i])
    ul <- c(ul, fit[i]+1.96*se.fit[i])
  }
  m <- cbind(fit, ll, ul)
  return(m)
}

prediction <- pois.confint(pois.fit)

prediction <- cbind(prediction, e_weight)

# Function to calculate offset
calc_offset <- function(x) {
  if (x[2]<=x[4]&x[4]<=x[3]) {return(0)}
  if (x[4]>x[3]) {return(x[4]-x[3])}
  if (x[4]<x[2]) {return(-(x[2]-x[4]))}
}

offset <- round(as.numeric(apply(prediction, 1, calc_offset)),digits=2)

# Add results to graph

E(pg_simp$proj2)$offset <- offset

# UNPUBLISHED FIGURE
plot(pg_simp$proj2,
     vertex.label=V(pg_simp$proj2)$size,
     layout=layout.fruchterman.reingold(pg$proj2,niter=500,area=vcount(pg$proj2)^2.3,repulserad=vcount(pg$proj2)^2.8),
     vertex.size=V(pg_simp$proj2)$size*0.3,
     edge.label=E(pg_simp$proj2)$offset,
     edge.width=E(pg_simp$proj2)$weight)

immigration_graph <- pg_simp$proj2

## Use n equally spaced breaks to assign each value to n-1 equal sized bins 
ii <- cut(E(immigration_graph)$offset, breaks = seq(min(E(immigration_graph)$offset), max(E(immigration_graph)$offset), len = 100), 
          include.lowest = TRUE)
## Use bin indices, ii, to select color from vector of n-1 equally spaced colors
E(immigration_graph)$color <- colorRampPalette(c("lightgray", "black"))(99)[ii]

E(immigration_graph)$color[E(immigration_graph)$offset<0] <- "yellow"


# Set color edges
# for (i in E(immigration_graph)) {
#  if (E(immigration_graph)$offset[i]==0) {color <- "lightgrey"}
#  if (E(immigration_graph)$offset[i]>0) {color <- "#FFFF00"}
#  if (E(immigration_graph)$offset[i]<0) {color <- "#FF0000"}
#  E(immigration_graph)$color[i] <-color
# }


## FIGURE 11 ##
par(mar=c(0,0,0,0))
plot(immigration_graph,
     layout=layout.circle,
     vertex.frame.color=NA,
     edge.label.color="black",
     # edge.color=range,
     vertex.label.family="Palatino",
     vertex.size=log(V(immigration_graph)$size)*3,
     vertex.label.color="black",
     edge.label.family="Palatino",
     edge.label.cex=0.9,
     edge.width=E(immigration_graph)$weight,
     # edge.label=E(immigration_graph)$offset,
     vertex.label=format(as.Date(V(immigration_graph)$createdAt), "%b %y"))


## Coalition thread ##

# Confidence vote analysis

coalition_url <- c("http://www.beppegrillo.it/listeciviche/forum/2013/02/fiducia-a-bersani-al-senato-e-poi-voti-secondo-coscienza-3.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/02/apertura-pd.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/02/unoccasione-concreta-per-lagenda-grillo.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/02/tanto-salviamo-gli-italiani-poi-vedremo.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/03/votare-la-fiducia-4.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/03/li-abbiamo-eletti-per-mandarli-a-casa-non-per-aiutarlivota-si-cliccando-la-stellina.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/03/votate-qui-se-volete-la-fiducia.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/03/mi-avete-deluso.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/04/controproposta-ai-delusi-ed-agli-infiltrati.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/04/a-grillogrilloquanti-errori-vedi-in-fiuli-dal-27-al-14.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/04/friuli-crash-questa-e-stata-la-risposta-dellelettorato.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/04/abbiamo-fatto-bene.html",
                   "http://www.beppegrillo.it/listeciviche/forum/2013/04/grazie-grillo-e-tutti-i-mister-no-per-questa-fine-ingloriosa.html")

# timestamp <- c("2013-08-24 21:23:10",
#               "2013-08-23 23:28:32",
#               "2013-08-26 19:11:00",
#               "2013-08-26 17:53:34",
#               "2013-05-16 20:25:00",
#               "2013-08-28 17:17:12",
#               "2013-08-29 21:42:43",
#               "2013-08-28 18:54:29",
#               "2013-08-30 18:31:41",
#               "2013-08-29 22:06:00",
#               "2013-08-30 21:35:56",
#               "2013-08-29 22:11:51",
#               "2013-08-30 22:04:37")


# matrix <- cbind(url,opinion,color)

# rm(timestamp);rm(opinion);rm(color)

vertices <- integer()

for (i in coalition_url) {
  vertices <- c(vertices, which(V(igraph_bi)$url==i))
}

for (i in coalition_url) {
  vertices <- c(vertices, unlist(
    neighborhood(igraph_bi, 1, which(V(igraph_bi)$url==i), mode=c("in"))))
}

bg <- induced.subgraph(igraph_bi, vertices)
V(bg)$size <- degree(bg, V(bg), mode="in")

bg_simp <- simplify(bg)
V(bg_simp)$size <- degree(bg_simp, V(bg_simp), mode="in")

pg_simp <- bipartite.projection(bg_simp)

V(pg_simp$proj2)$opinion <- c("pro",
                              "pro",
                              "pro",
                              "pro",
                              "con",
                              "con",
                              "pro",
                              "pro",
                              "con",
                              "pro",
                              "pro",
                              "neutral",
                              "pro")

V(pg_simp$proj2)$color <- c("red1",
                            "red1",
                            "red1",
                            "red1",
                            "royalblue1",
                            "royalblue1",
                            "red1",
                            "red1",
                            "royalblue1",
                            "red1",
                            "red1",
                            "grey",
                            "red1")

# Edge list & attributes
v_pairs <- get.edges(pg_simp$proj2, E(pg_simp$proj2))
left_size <- as.integer(V(pg_simp$proj2)$size[v_pairs[,1]])
right_size <- as.integer(V(pg_simp$proj2)$size[v_pairs[,2]])
left_time <- as.integer(as.POSIXct(V(pg_simp$proj2)$createdAt[v_pairs[,1]],
                                   format="%Y-%m-%dT%H:%M:%S", 
                                   tz="UTC"))
right_time <- as.integer(as.POSIXct(V(pg_simp$proj2)$createdAt[v_pairs[,2]],
                                    format="%Y-%m-%dT%H:%M:%S", 
                                    tz="UTC"))
e_weight <- as.integer(E(pg_simp$proj2)$weight)
time_diff <- abs(left_time - right_time)
rm(right_time,left_time)

# Create predictor df
edge_predictors <- e_weight
# rm(e_weight)
edge_predictors <- cbind(edge_predictors, left_size)
rm(left_size)
colnames(edge_predictors)[1] <- "e_weight"
edge_predictors <- cbind(edge_predictors, right_size)
rm(right_size)
mult_size <- edge_predictors[,2]*edge_predictors[,3]
edge_predictors <- cbind(edge_predictors, mult_size)
rm(mult_size)
edge_predictors <- cbind(edge_predictors, time_diff)
rm(time_diff)
edge_predictors <- data.frame(edge_predictors)

# Predict "expected" values
pois.pr <- predict(lm.fit, newdata=edge_predictors, se.fit=TRUE)
pois.fit <- cbind(pois.pr$fit, pois.pr$se.fit)

# Create function to generate confidence intervals 
# for poisson regression. It expects two column matrix
pois.confint <- function(pois.fit){
  fit <- pois.fit[,1]
  se.fit <- pois.fit[,2]
  ll <- numeric()
  ul <- numeric()
  for (i in 1:length(fit)) {
    ll <- c(ll, fit[i]-1.96*se.fit[i])
    ul <- c(ul, fit[i]+1.96*se.fit[i])
  }
  m <- cbind(fit, ll, ul)
  return(m)
}

prediction <- pois.confint(pois.fit)

prediction <- cbind(prediction, e_weight)

# Function to calculate offset
calc_offset <- function(x) {
  if (x[2]<=x[4]&x[4]<=x[3]) {return(0)}
  if (x[4]>x[3]) {return(x[4]-x[3])}
  if (x[4]<x[2]) {return(-(x[2]-x[4]))}
}

offset <- round(as.numeric(apply(prediction, 1, calc_offset)),digits=2)

# Add results to graph

E(pg_simp$proj2)$offset <- offset

confidence_vote_graph <- pg_simp$proj2

# Simplify
# E(confidence_vote_graph)$weight2 <- E(confidence_vote_graph)$weight
# E(confidence_vote_graph)$weight2[E(confidence_vote_graph)$weight<500] <- NA

# Set color edges
# for (i in E(confidence_vote_graph)) {
#   if (E(confidence_vote_graph)$offset[i]==0) {color <- "lightgrey"}
#   if (E(confidence_vote_graph)$offset[i]>0) {color <- "#FFFF00"}
#   if (E(confidence_vote_graph)$offset[i]<0) {color <- "#FF0000"}
#   E(confidence_vote_graph)$color[i] <-color
# }

## Use n equally spaced breaks to assign each value to n-1 equal sized bins 
ii <- cut(E(confidence_vote_graph)$offset, breaks = seq(min(E(confidence_vote_graph)$offset), max(E(confidence_vote_graph)$offset), len = 100), 
          include.lowest = TRUE)
## Use bin indices, ii, to select color from vector of n-1 equally spaced colors
E(confidence_vote_graph)$color <- colorRampPalette(c("lightgray", "black"))(99)[ii]

# Simplify
E(confidence_vote_graph)$offset2 <- E(confidence_vote_graph)$offset
E(confidence_vote_graph)$offset2[E(confidence_vote_graph)$offset2==0] <- NA

## FIGURE 12 ##
l <- layout.circle
par(mar=c(0,0,0,0))
plot(confidence_vote_graph,
     layout=l,
     vertex.frame.color=NA,
     edge.label.color="black",
     vertex.label.family="Palatino",
     vertex.size=log(V(confidence_vote_graph)$size)*2.5,
     vertex.label.color="white",
     edge.label.family="Palatino",
     edge.label.cex=0.9,
     edge.width=log(E(confidence_vote_graph)$weight),
     vertex.label=V(confidence_vote_graph)$size)
# edge.label=E(confidence_vote_graph)$offset2)
#edge.label=gsub( "NA" , "" ,  sub("[ ]+", "", 
#              format(E(confidence_vote_graph)$weight2, 
#                    big.mark=",", scientific=FALSE))))


## Poisson regression ##

train_predictors <- data.frame(train_predictors)

# Calculate robust standard errors
cov.lm.fit <- vcovHC(lm.fit, type = "HC0")
rob.std.err <- sqrt(diag(cov.lm.fit))

# Create vector for naive standard errors
naive.std.err <- summary(lm.fit)$coefficients[,2]

# Table composition from http://www.ats.ucla.edu/stat/r/dae/poissonreg.htm
r.est <- cbind(Estimate = coef(lm.fit), `Robust SE` = rob.std.err, `Pr(>|z|)` = 2 *
                 pnorm(abs(coef(lm.fit)/rob.std.err), lower.tail = FALSE), LL = coef(lm.fit) - 1.96 *
                 rob.std.err, UL = coef(lm.fit) + 1.96 * rob.std.err)

## Table A3. Overdispersion test ##
# Goodness of fit and overdispersion test
goodfit <- goodfit(train_predictors$e_weight)
summary(goodfit)
dispersiontest(lm.fit)

## Table A4 - Regression summary ##
stargazer(lm.fit, lm.fit,
          title = "Regression summary",
          se=list(NULL, rob.std.err),
          column.labels=c("Naive SE","Robust SE"), 
          align=TRUE,
          digits = 6,
          style = "all") 
